import pyreadr
import pandas as pd
import numpy as np
import joypy
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import networkx as nx
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import wbdata as wb
import datetime
import warnings
# Suppress benign warnings
warnings.filterwarnings("ignore")

from IPython.display import Image, HTML, display
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,power_transform
from sklearn.decomposition import PCA, FastICA

%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)
# For offline rendering in JupyterLab
# alt.renderers.enable('default')
# alt.data_transformers.enable('default')

Introduction

Policy and media attention on illicit financial flows (IFF) has increased, with the recognition that Africa is a net creditor to the world.

NYT-2013 Guardian-2015
New York Times (2013) Guardian (2015)
Guardian-2017 Economist-2019
Guardian (2017) Economist (2019)

What is trade mis-invoicing?

  • The deliberate mis-statement of price or quantity of internationally traded goods in invoices presented to customs

  • Can occur at import or export

  • Can result in an inflow or outflow of money

Motivations for trade mis-invoicing include:

  • Evading tariffs

  • Exploiting subsidy regimes

  • Hiding transfers of wealth

conceptual-model

Why does trade mis-invoicing matter?

  • Outflows undermine the fiscal base and public spending

  • Financing gap needed to meet the Sustainable Development Goals (SDGs)

  • Combating trade mis-invoicing is crucial for the mobilization of domestic resources in the continent, and can catalyze sustainable development

conceptual-model

Data source

  • trade mis-invoicing data-set of the United Nations Economic Commission for Africa

  • panel with \(n=6,248,254\) of mis-invoiced trade between 179 reporting jurisdictions and 179 partner countries for years 2000-2016, disaggregated commodities (HS2 code)

Methodology for calculating mis-invoiced trade

  • Locate mis-invoicing in the discrepancies between reported trade flows and their mirrored statistics

  • But not all observed discrepancies are due to illicit motives!

  • Imports tend to be recorded on the basis of Cost of Insurance and Freight (CIF), while exports are recorded Free On Board (FOB)

Our approach

  1. Estimate the discrepancies between imports and mirror net exports as a function of both licit and illicit predictors

  2. Perform a harmonization procedure in order to generate a reconciled value that represents our best estimate of what the legitimate value of the trade should be on a FOB basis

  3. Calculate the IFF embedded in each transaction as the difference between the observed value and the reconciled value

Zoom in on Africa

  • During 2000-2016, the continent lost on average $83 billion a year in gross illicit outflows

  • Net cumulative flows during that period were $362 billion

Africa-map

Goals

Goals

  • Perform unsupervised learning on this data to extract meaningful insights

  • Dimension reduction (PCA) - can I recover policy-relevant dimensions of variation?

Data wrangling

  1. Data-set of mis-invoiced trade by sector (unit of observation is reporter-year)

    • View 1: unit of observation = reporter, features = sectors

    • View 2: unit of observation = sectors, features = reporters

  2. Bilateral trade matrix (dyadic data)

Mis-invoiced trade data (for African reporting countries)

panel = pyreadr.read_r('Data/GER_Orig_Sect_Year_Africa.Rdata')
IFF_Sector = panel['GER_Orig_Sect_Year_Africa']
obs_info = IFF_Sector.reset_index().drop_duplicates(['reporter.ISO', 'year'])[['reporter.ISO', 
                                                                               'year', 
                                                                               'reporter', 
                                                                               'rIncome', 
                                                                               'rDev']]
obs_info = obs_info.replace({'LIC': 'Low income', 
                             'LMC': 'Lower-middle income',
                             'UMC': 'Upper-middle income',
                             'HIC': 'High income'})
obs_info = obs_info.rename(columns={'rIncome': 'Income group (World Bank)', 
                                    'rDev': 'Country status (UN)'})
IFF_Sector = IFF_Sector.fillna(0).drop(columns=['reporter', 'rIncome', 'rDev', 'section.code',
                                                'Imp_IFF_lo', 'Exp_IFF_lo']) \
            .set_index(['reporter.ISO', 'year'])

IFF_Sector_mean = IFF_Sector.reset_index().groupby(['reporter.ISO', 'section']).mean(). \
    reset_index().set_index('reporter.ISO')

Mis-invoiced trade for countries by sectors

IFF_Sector_Imp = IFF_Sector[['section', 'Imp_IFF_hi']]
IFF_Sector_Exp = IFF_Sector[['section', 'Exp_IFF_hi']]
IFF_Sector_Imp
section Imp_IFF_hi
reporter.ISO year
DZA 2001 Animal and Animal Products 1.914633e+08
2001 Pulp of Wood or of Other Fibrous Material 7.436143e+07
2001 Textiles 3.560644e+07
2001 Footwear and Headgear 1.146507e+06
2001 Stone, Glass, and Ceramics 2.935880e+07
... ... ... ...
ZWE 2007 Arms and Ammunition 0.000000e+00
2009 Works of Art 0.000000e+00
2013 Miscellaneous Manufactured Articles 0.000000e+00
2014 Miscellaneous Manufactured Articles 0.000000e+00
2014 Mineral Products 0.000000e+00

11133 rows × 2 columns

Mis-invoiced trade data (bilateral trade matrix)

panel = pyreadr.read_r('Data/GER_Orig_Dest_Year_Africa.Rdata')
IFF_Dest = panel['GER_Orig_Dest_Year_Africa']
IFF_Dest = IFF_Dest.fillna(0).drop(columns=['reporter', 'rIncome', 'rDev', 
                                            'partner', 'pRegion', 'pIncome', 'pDev',
                                            'Imp_IFF_lo', 'Exp_IFF_lo']) \
            .set_index(['reporter.ISO', 'year'])

IFF_Dest_mean = IFF_Dest.reset_index().groupby(['reporter.ISO', 'partner.ISO']).mean(). \
    reset_index().set_index('reporter.ISO')

Mis-invoiced trade for dyads

IFF_Dest_Imp = IFF_Dest[['partner.ISO', 'Imp_IFF_hi']]
IFF_Dest_Exp = IFF_Dest[['partner.ISO', 'Exp_IFF_hi']]
IFF_Dest_Imp
partner.ISO Imp_IFF_hi
reporter.ISO year
DZA 2001 AND 1.609561e+04
2001 ARG 4.717459e+07
2001 AUS 2.027641e+07
2001 AUT 7.641706e+07
2001 BEL 2.285729e+07
... ... ... ...
ZWE 2014 MDG 0.000000e+00
2014 SGP 0.000000e+00
2014 CHE 0.000000e+00
2014 ARE 0.000000e+00
2015 UGA 0.000000e+00

34757 rows × 2 columns

Crosswalk data

# This doesn't work on Binder
! wget -qO Data/crosswalk.xlsx https://github.com/walice/Codes-Masterlist/blob/master/Codes_Masterlist.xlsx?raw=true

More data wrangling

crosswalk = pd.read_excel("Data/crosswalk.xlsx").rename(columns={'Country': 'country'})
crosswalk.head()
ISO3166.3 ISO3166.2 country UN_Region UN_Region_Code UN_Sub-region UN_Sub-region_Code UN_Intermediate_Region UN_Intermediate_Region_Code UN_M49_Code ... WB_Income_Group_Code WB_Region WB_Lending_Category WB_Other OECD EU28 Arab League Commonwealth Longitude Latitude
0 ABW AW Aruba Americas 19.0 Latin America and the Caribbean 419.0 Caribbean 29.0 533.0 ... HIC Latin America and Caribbean .. NaN 0 0 0 0 -69.982677 12.520880
1 AFG AF Afghanistan Asia 142.0 Southern Asia 34.0 NaN NaN 4.0 ... LIC South Asia IDA HIPC 0 0 0 0 66.004734 33.835231
2 AFG AF Afghanistan, Islamic Republic of Asia 142.0 Southern Asia 34.0 NaN NaN 4.0 ... LIC South Asia IDA HIPC 0 0 0 0 66.004734 33.835231
3 AGO AO Angola Africa 2.0 Sub-Saharan Africa 202.0 Middle Africa 17.0 24.0 ... LMC Sub-Saharan Africa IBRD NaN 0 0 0 0 17.537368 -12.293361
4 AIA AI Anguila Americas 19.0 Latin America and the Caribbean 419.0 Caribbean 29.0 660.0 ... NaN NaN NaN NaN 0 0 0 0 -63.064989 18.223959

5 rows × 25 columns

Auxiliary functions

def create_features(data, values, features, obs):
    """
    TODO: add doc string
    """
    features_data = data.pivot_table(values=values, 
                                     columns=features, 
                                     index=[obs, 'year'], 
                                     fill_value=0)
    return features_data
def create_features_mean(data, values, features, obs):
    """
    TODO: add doc string
    """
    features_data = data.pivot_table(values=values, 
                                     columns=features, 
                                     index=obs, 
                                     fill_value=0)
    return features_data
def biplot_PCA(features_data, nPC=2, firstPC=1, secondPC=2, obs='reporter.ISO', show_loadings=False):
    """Projects the data in the 2-dimensional space spanned by 2 principal components
    chosen by the user, along with a bi-plot of the top 3 loadings per PC, and colors
    by class label.

    Args:
        features_data: data-set of features
        nPC: number of principal components
        firstPC: integer denoting first principal component to plot in bi-plot
        secondPC: integer denoting second principal component to plot in bi-plot
        obs: string denoting index of class labels (in features_data)
        show_loadings: Boolean indicating whether PCA loadings should be displayed
    Returns:
        plot (interactive)
        pca_loadings (if show_loadings=True)
    """
        
    # Run PCA (standardize data beforehand)
    features_data_std = StandardScaler().fit_transform(features_data)
#     features_data_std = power_transform(features_data, method='yeo-johnson', standardize=True)
    pca = PCA(n_components=nPC, random_state=234)
    princ_comp = pca.fit_transform(features_data_std)

    # Extract PCA loadings
    cols = ['PC' + str(c+1) for c in np.arange(nPC)]
    pca_loadings = pd.DataFrame(pca.components_.T, 
                                columns=cols,
                                index=list(features_data.columns))
   
    # Extract PCA scores
    pca_scores = pd.DataFrame(princ_comp, 
                              columns=cols)
    pca_scores[obs] = features_data.reset_index()[obs].values.tolist()
    pca_scores['year'] = features_data.reset_index()['year'].values.tolist()
    
    score_PC1 = princ_comp[:,firstPC-1]
    score_PC2 = princ_comp[:,secondPC-1]
    
    # Generate plot data
    if obs == 'reporter.ISO':
        plot_data = pd.merge(pca_scores, obs_info, on=[obs, 'year'])
        color_obs = 'reporter'
        tooltip_obs = ['reporter', 'year', 'Income group (World Bank)', 'Country status (UN)']
    else:
        plot_data = pca_scores
        color_obs = 'section'
        tooltip_obs = ['section', 'year']

    # Return chosen PCs to plot
    PC1 = 'PC'+str(firstPC)
    PC2 = 'PC'+str(secondPC)

    # Extract top loadings (in absolute value)
    # TO DO: use dict to iterate over
    toploadings_PC1 = pca_loadings.apply(lambda x: abs(x)).sort_values(by=PC1).tail(3)[[PC1, PC2]]
    toploadings_PC2 = pca_loadings.apply(lambda x: abs(x)).sort_values(by=PC2).tail(3)[[PC1, PC2]]

    originsPC1 = pd.DataFrame({'index':toploadings_PC1.index.tolist(), 
                               PC1: np.zeros(3), 
                               PC2: np.zeros(3)})
    originsPC2 = pd.DataFrame({'index':toploadings_PC2.index.tolist(), 
                               PC1: np.zeros(3), 
                               PC2: np.zeros(3)})
    
    toploadings_PC1 = pd.concat([toploadings_PC1.reset_index(), originsPC1], axis=0)
    toploadings_PC2 = pd.concat([toploadings_PC2.reset_index(), originsPC2], axis=0)

    toploadings_PC1[PC1] = toploadings_PC1[PC1]*max(score_PC1)*1.5
    toploadings_PC1[PC2] = toploadings_PC1[PC2]*max(score_PC2)*1.5
    toploadings_PC2[PC1] = toploadings_PC2[PC1]*max(score_PC1)*1.5
    toploadings_PC2[PC2] = toploadings_PC2[PC2]*max(score_PC2)*1.5
    
    # Project top 3 loadings over the space spanned by 2 principal components
    lines = alt.Chart().mark_line().encode()
    for color, i, dataset in zip(['#440154FF', '#21908CFF'], [0,1], [toploadings_PC1, toploadings_PC2]):
        lines[i] = alt.Chart(dataset).mark_line(color=color).encode(
        x= PC1 +':Q',
        y= PC2 +':Q',
        detail='index'
    ).properties(
        width=500,
        height=500
    )
    
    # Add labels to the loadings
    text=alt.Chart().mark_text().encode()
    for color, i, dataset in zip(['#440154FF', '#21908CFF'], [0, 1], [toploadings_PC1[0:3], toploadings_PC2[0:3]]):
        text[i] = alt.Chart(dataset).mark_text(
                align='left',
                baseline='bottom',
                color=color
            ).encode(
                x= PC1 +':Q',
                y= PC2 +':Q',
                text='index'
            )
    
    # Scatter plot colored by observation class label
    points = alt.Chart(plot_data).mark_circle(size=60).encode(
        x=alt.X(PC1, axis=alt.Axis(title='Principal Component ' + str(firstPC))),
        y=alt.X(PC2, axis=alt.Axis(title='Principal Component ' + str(secondPC))),
        color=alt.Color(color_obs, scale=alt.Scale(scheme='category20b'),
                       legend=alt.Legend(orient='right')),
        tooltip=tooltip_obs
    ).interactive()
    
    # Bind it all together
    chart = (points + lines[0] + lines[1] + text[0] + text[1])    
    chart.display()

    if show_loadings:
        return pca_loadings
def scree_plot(features_data, show_explained_var=False):
    """
    TODO: add comments and docstring
    """
    
    features_data_std = StandardScaler().fit_transform(features_data)
    pca = PCA(n_components=features_data_std.shape[1], random_state=234)
    princ_comp = pca.fit_transform(features_data_std)
    
    explained_var = pd.DataFrame({'PC': np.arange(1,features_data_std.shape[1]+1),
                                  'var': pca.explained_variance_ratio_,
                                  'cumvar': np.cumsum(pca.explained_variance_ratio_)})

    # Adapted from https://altair-viz.github.io/gallery/multiline_tooltip.html
    # Create a selection that chooses the nearest point & selects based on x-value
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['PC'], empty='none')

    # The basic line
    line = alt.Chart(explained_var).mark_line(interpolate='basis', color='#FDE725FF').encode(
        alt.X('PC:Q',
            scale=alt.Scale(domain=(1, len(explained_var))),
            axis=alt.Axis(title='Principal Component')
        ),
        alt.Y('cumvar:Q',
            scale=alt.Scale(domain=(min(explained_var['cumvar']), 1)),
            axis=alt.Axis(title='Cumulative Variance Explained')
        ),
    )

    # Transparent selectors across the chart. This is what tells us
    # the x-value of the cursor
    selectors = alt.Chart(explained_var).mark_point().encode(
        x='PC:Q',
        opacity=alt.value(0),
    ).add_selection(
        nearest
    )

    # Draw points on the line, and highlight based on selection
    points = line.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )

    # Draw text labels near the points, and highlight based on selection
    text = line.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.condition(nearest, 'cumvar:Q', alt.value(' '))
    )

    # Draw a rule at the location of the selection
    rules = alt.Chart(explained_var).mark_rule(color='gray').encode(
        x='PC:Q',
    ).transform_filter(
        nearest
    )

    # Put the five layers into a chart and bind the data
    out = alt.layer(
        line, selectors, points, rules, text
    ).properties(
        title='Cumulative scree plot',
        width=600, height=300
    )
    
    out.display()
    
    if show_explained_var:
        return explained_var[['PC', 'var']]

PCA on feature space (for individual reporting countries)

Sector features

sector_features = create_features(IFF_Sector_Imp, 'Imp_IFF_hi', 
                                  features='section', obs='reporter.ISO')
sector_features
section Animal and Animal Products Animal or Vegetable Fats and Oils Arms and Ammunition Base Metals Chemicals and Allied Industries Footwear and Headgear Machinery and Electrical Mineral Products Miscellaneous Manufactured Articles Pearls, Precious Stones and Metals ... Precision Instruments Prepared Foodstuffs Pulp of Wood or of Other Fibrous Material Raw Hides, Skins, Leather, and Furs Stone, Glass, and Ceramics Textiles Transportation Vegetable Products Wood and Wood Products Works of Art
reporter.ISO year
AGO 2009 2.694946e+08 7.652455e+07 67738.261242 1.025920e+09 5.115176e+08 2.920410e+07 1.525512e+09 1.727293e+09 2.472936e+08 5.575633e+05 ... 1.242156e+08 4.781184e+08 1.149992e+08 1.097769e+07 1.349018e+08 1.827787e+08 2.198838e+09 4.249722e+08 4.337043e+07 177640.929155
2010 2.064906e+08 7.311396e+07 0.000000 9.989842e+08 2.575443e+08 7.853065e+06 1.308337e+09 2.953387e+09 8.404302e+07 3.209177e+05 ... 1.115183e+08 2.342889e+08 7.332134e+07 4.013044e+06 7.601278e+07 8.285704e+07 9.611656e+08 1.959128e+08 2.310136e+07 404828.231824
2011 2.995594e+08 8.058180e+07 89365.880480 6.689511e+08 2.931591e+08 1.172881e+07 1.389743e+09 2.286539e+09 4.413341e+07 9.645181e+05 ... 1.418956e+08 3.159980e+08 9.911835e+07 9.995669e+06 6.607126e+07 7.912925e+07 7.070474e+08 3.734589e+08 1.363601e+07 586586.948365
2012 7.103770e+08 2.746924e+08 261265.895948 1.753970e+09 8.540407e+08 3.823724e+07 2.790368e+09 9.203666e+08 1.763904e+08 1.271289e+06 ... 2.073111e+08 1.025668e+09 2.641831e+08 1.394356e+07 1.630307e+08 1.683128e+08 1.975553e+09 6.541154e+08 3.980884e+07 739122.050820
2013 4.031560e+08 1.541679e+08 188343.533148 9.230300e+08 4.793908e+08 1.212363e+07 1.428283e+09 2.009915e+09 4.871315e+07 6.250082e+05 ... 1.063123e+08 4.931687e+08 1.798533e+08 8.930246e+06 7.882264e+07 7.858108e+07 4.962937e+09 3.995114e+08 1.836680e+07 418841.640492
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ZWE 2011 3.098093e+07 6.060789e+07 9124.882478 1.693777e+08 2.260227e+09 2.817909e+06 3.231869e+08 2.747704e+08 9.872441e+06 3.618592e+06 ... 2.136919e+07 1.144497e+08 5.855387e+07 6.259512e+05 2.652474e+07 4.726781e+07 8.963644e+08 1.874874e+08 6.016952e+06 35481.251510
2012 2.784481e+07 5.549232e+07 856.631347 1.085453e+08 4.929549e+08 7.015919e+06 3.451931e+08 1.461844e+08 1.510668e+07 2.876014e+05 ... 3.162707e+07 2.025934e+08 6.024629e+07 1.412797e+06 2.482629e+07 5.147542e+07 9.468142e+08 1.900125e+08 7.901902e+06 59677.045599
2013 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000
2014 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000
2015 1.458046e+07 7.135043e+07 19192.507302 8.703951e+07 2.299085e+08 4.468801e+06 3.437244e+08 1.401916e+08 2.432263e+07 3.672531e+07 ... 3.233606e+07 3.778318e+07 3.157795e+07 1.061420e+06 1.724449e+07 2.841926e+07 2.137692e+08 1.585549e+08 4.569325e+06 6305.070602

624 rows × 21 columns

Biplots

biplot_PCA(sector_features, 10, 1, 2, obs='reporter.ISO', show_loadings=True)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
Animal and Animal Products 0.223097 -0.285686 -0.052218 0.200337 0.280789 -0.075192 -0.159824 0.157986 -0.133621 0.067581
Animal or Vegetable Fats and Oils 0.145805 -0.041750 0.262100 -0.281551 -0.378544 -0.470151 0.409970 0.471670 0.177591 -0.031639
Arms and Ammunition 0.030559 -0.044358 0.524677 -0.654182 0.434354 0.055627 -0.201368 -0.113006 -0.079142 -0.115541
Base Metals 0.235966 -0.206534 -0.261180 -0.147777 -0.019680 -0.153272 0.017922 -0.247084 0.174715 -0.069873
Chemicals and Allied Industries 0.271238 0.107591 -0.117222 -0.089472 -0.096976 -0.130645 -0.071165 -0.219172 -0.198166 0.036517
Footwear and Headgear 0.206357 0.378112 0.015170 -0.019961 -0.001383 -0.029500 -0.042735 0.010544 0.026510 -0.252435
Machinery and Electrical 0.272101 0.160466 0.076229 0.091921 -0.067637 -0.159494 0.022549 -0.135941 -0.157103 -0.115312
Mineral Products 0.238785 0.071925 -0.098357 -0.201011 -0.166463 0.073908 -0.133348 0.121464 -0.241190 0.675042
Miscellaneous Manufactured Articles 0.246123 0.283544 0.122775 0.033592 -0.049743 -0.014818 -0.017630 0.069480 -0.247438 -0.043076
Pearls, Precious Stones and Metals 0.060193 0.251657 -0.373387 -0.153599 0.567863 0.055855 0.647708 0.088517 -0.049224 0.089860
Plastics and Rubbers 0.264093 -0.144764 0.124109 0.236111 0.112342 0.023331 0.062706 0.076398 0.119468 -0.076649
Precision Instruments 0.223754 0.362288 -0.013533 0.023957 -0.084425 -0.090796 -0.080419 -0.186415 -0.262231 -0.078130
Prepared Foodstuffs 0.252722 -0.227483 -0.012139 0.160146 0.177798 -0.024717 -0.088546 0.289203 -0.187744 -0.058613
Pulp of Wood or of Other Fibrous Material 0.272304 -0.112890 -0.010570 0.074023 0.024044 -0.016865 0.003323 -0.123810 0.135842 0.200798
Raw Hides, Skins, Leather, and Furs 0.206454 0.084348 0.178427 0.121865 -0.101317 0.619649 0.102592 0.287981 -0.057302 -0.251296
Stone, Glass, and Ceramics 0.225603 0.111290 0.264504 0.269760 0.061210 -0.018303 0.176815 -0.368090 0.400752 -0.018317
Textiles 0.209936 -0.103101 -0.054321 -0.264334 -0.273480 0.542184 0.126078 -0.076346 0.216014 0.139611
Transportation 0.238443 -0.119628 0.256225 0.081261 0.184539 -0.059237 0.015553 -0.095856 0.219874 0.334486
Vegetable Products 0.245486 -0.284943 -0.122225 -0.029458 0.071482 -0.036325 -0.074223 0.202113 -0.106308 -0.240274
Wood and Wood Products 0.201178 -0.222154 -0.377723 -0.300290 -0.132165 0.007197 -0.048891 -0.185469 0.074259 -0.351532
Works of Art 0.089055 0.388758 -0.239308 -0.078967 0.157495 -0.041382 -0.486227 0.369405 0.561157 0.018339

treemap-sectors

biplot_PCA(sector_features, 10, 3, 4, obs='reporter.ISO')
biplot_PCA(sector_features, 10, 5, 6, obs='reporter.ISO')

Explained variance

scree_plot(sector_features, show_explained_var=True)
PC var
0 1 0.524988
1 2 0.121415
2 3 0.054159
3 4 0.049898
4 5 0.042107
5 6 0.041007
6 7 0.036014
7 8 0.028847
8 9 0.024146
9 10 0.018279
10 11 0.012558
11 12 0.009987
12 13 0.007011
13 14 0.006890
14 15 0.005213
15 16 0.004848
16 17 0.004001
17 18 0.002819
18 19 0.002489
19 20 0.001800
20 21 0.001524

Variance-stabilizing and normalizing transformations

# fig, axes = joypy.joyplot(sector_features, colormap=plt.cm.viridis, figsize=(8,8));
sector_features_log = sector_features.apply(lambda x: np.log(x+1) if np.issubdtype(x.dtype, np.number) else x, axis=0)
# fig, axes = joypy.joyplot(sector_features_log, colormap=plt.cm.viridis, figsize=(8,8));
sector_features_yeo = power_transform(sector_features, method='yeo-johnson', standardize=True)
sector_features_yeo = pd.DataFrame(sector_features_yeo,
                                   index=sector_features.index,
                                   columns=sector_features.columns)
# fig, axes = joypy.joyplot(sector_features_yeo, colormap=plt.cm.viridis, figsize=(8,8));
biplot_PCA(sector_features, 10, 1, 2, obs='reporter.ISO')
biplot_PCA(sector_features_log, 10, 1, 2, obs='reporter.ISO')
biplot_PCA(sector_features_yeo, 10, 1, 2, obs='reporter.ISO')

Country features

country_features = create_features(IFF_Sector_Imp, 'Imp_IFF_hi',
                                   features='reporter.ISO', obs='section')
country_features
reporter.ISO AGO BDI BEN BFA BWA CAF CIV CMR COG COM ... STP SWZ SYC TGO TUN TZA UGA ZAF ZMB ZWE
section year
Animal and Animal Products 2000 0.000000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000e+00 0.00000 0.0 ... 0.000000 0.000000 0.000000e+00 4.044600e+06 1.572640e+07 1.060197e+06 2.902091e+05 2.341209e+07 0.000000e+00 0.000000e+00
2001 0.000000 0.000000 9.464068e+06 2.013378e+06 4.018742e+06 351958.882652 7.103384e+07 2.552567e+07 0.00000 0.0 ... 0.000000 0.000000 3.439935e+07 1.087398e+07 0.000000e+00 0.000000e+00 2.522664e+05 1.854458e+07 1.832385e+06 0.000000e+00
2002 0.000000 288779.572593 1.458450e+07 7.756380e+05 3.524820e+06 213560.077651 7.767896e+07 2.907597e+07 0.00000 0.0 ... 0.000000 0.000000 0.000000e+00 6.574580e+06 9.664435e+06 0.000000e+00 6.944404e+05 1.452489e+07 4.354400e+05 7.346496e+06
2003 0.000000 217920.960357 2.297721e+07 4.533317e+05 0.000000e+00 0.000000 1.286866e+08 3.485783e+07 0.00000 0.0 ... 0.000000 0.000000 0.000000e+00 6.290272e+06 1.732785e+07 1.294665e+06 1.358896e+06 2.153838e+07 1.422125e+06 0.000000e+00
2004 0.000000 0.000000 2.742553e+07 3.429694e+05 5.275531e+04 0.000000 9.283012e+07 4.940023e+07 0.00000 0.0 ... 0.000000 59632.763605 0.000000e+00 3.901862e+06 3.574647e+07 0.000000e+00 8.053124e+05 2.868016e+07 2.791570e+06 0.000000e+00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Works of Art 2012 739122.050820 0.000000 0.000000e+00 1.054676e+04 2.623004e+05 0.000000 1.820478e+04 3.875133e+05 0.00000 0.0 ... 0.000000 0.000000 0.000000e+00 1.125696e+03 0.000000e+00 7.252236e+04 1.388514e+04 1.371564e+07 3.453691e+04 5.967705e+04
2013 418841.640492 0.000000 0.000000e+00 0.000000e+00 2.282032e+04 0.000000 1.287629e+04 2.104949e+04 0.00000 0.0 ... 9584.734907 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 5.191721e+05 2.986963e+04 1.006657e+07 1.017682e+05 0.000000e+00
2014 390140.730849 0.000000 0.000000e+00 0.000000e+00 9.331590e+04 0.000000 1.017252e+06 1.826816e+05 8003.37246 0.0 ... 20941.755221 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 6.731774e+04 2.129568e+04 2.543377e+07 0.000000e+00 0.000000e+00
2015 0.000000 0.000000 0.000000e+00 0.000000e+00 5.977903e+04 0.000000 0.000000e+00 0.000000e+00 0.00000 0.0 ... 4447.368931 0.000000 5.857979e+05 0.000000e+00 3.088086e+03 5.385830e+05 1.193596e+04 6.697451e+06 5.843484e+04 6.305071e+03
2016 0.000000 0.000000 0.000000e+00 0.000000e+00 4.195233e+04 0.000000 0.000000e+00 0.000000e+00 0.00000 0.0 ... 1327.652396 0.000000 4.463501e+05 0.000000e+00 0.000000e+00 3.443084e+04 2.305443e+03 1.295357e+07 0.000000e+00 0.000000e+00

357 rows × 46 columns

Biplots

biplot_PCA(country_features, 10, 1, 2, obs='section', show_loadings=True)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
AGO 0.172866 5.695754e-02 1.915290e-01 -6.571267e-02 -1.498597e-02 2.683983e-01 -6.131957e-02 -9.901379e-02 -1.352142e-02 1.960619e-01
BDI 0.159396 1.399172e-02 1.100982e-01 1.072287e-01 -5.780584e-02 -3.006342e-02 1.311807e-01 -6.306715e-02 8.380471e-02 -1.595603e-01
BEN 0.166261 2.839527e-01 -4.853377e-02 -1.227506e-01 -8.929173e-02 2.840864e-02 -5.069704e-02 -1.050768e-01 1.115173e-01 -1.479526e-01
BFA 0.200912 1.961349e-01 -1.479219e-02 -3.284903e-02 -7.687652e-03 1.947496e-01 -8.961797e-03 -9.812748e-02 5.298346e-02 -6.100222e-02
BWA 0.058704 5.101605e-04 1.048248e-01 1.176491e-01 1.116177e-01 4.542630e-02 5.291913e-02 1.702242e-01 1.074927e-01 6.990029e-01
CAF 0.136213 -2.941790e-01 7.144585e-02 1.466869e-01 1.005191e-01 -8.211151e-04 1.767076e-02 -9.841998e-02 4.459720e-02 -6.455230e-02
CIV 0.170157 -4.190252e-02 -1.992766e-01 -1.178971e-01 -8.139640e-02 2.099028e-01 1.481461e-01 -9.124417e-02 9.430036e-02 -2.959315e-02
CMR 0.199563 6.685309e-03 2.723196e-02 -1.591573e-01 -1.158744e-01 4.651018e-02 4.997217e-02 -4.900403e-02 6.736277e-02 -4.601119e-02
COG 0.092230 -1.314851e-01 3.146417e-01 -1.361157e-01 -2.737181e-01 2.682128e-01 -4.522047e-02 1.802042e-02 4.328099e-02 3.111778e-02
COM 0.096409 -2.063490e-01 2.921122e-01 -2.226963e-01 -7.053853e-02 -1.456883e-01 8.379586e-02 -6.576844e-02 9.280223e-02 -1.472937e-01
CPV 0.162562 -1.932140e-01 3.911200e-02 -6.076881e-02 6.992487e-02 3.542767e-02 2.890140e-01 -1.259045e-01 5.085305e-02 -4.426356e-02
DZA 0.097941 4.125012e-02 -3.694645e-02 4.185959e-01 -2.291923e-01 -2.224267e-01 -7.905464e-02 8.880404e-02 -9.011562e-02 -5.841375e-02
EGY 0.175564 1.852732e-01 9.778806e-02 1.870187e-01 8.946189e-03 -2.924499e-03 1.181851e-02 -4.328654e-02 4.339753e-03 -1.929171e-01
ETH 0.121994 -1.897909e-01 7.695418e-02 3.167105e-01 2.484336e-01 -1.802844e-01 -7.872203e-02 -9.597359e-02 -2.026803e-02 -1.094738e-01
GAB 0.042052 -1.989049e-01 -3.375917e-01 1.247060e-02 -2.588424e-01 -4.558121e-02 8.313195e-02 9.621236e-02 -7.997286e-02 -1.514322e-01
GHA 0.107784 -1.996829e-01 -6.503997e-02 6.007675e-02 -3.272539e-01 2.451643e-01 4.926298e-02 2.807364e-01 -3.340735e-01 -5.059900e-03
GIN 0.052400 -1.350973e-01 -3.351174e-01 -8.155761e-02 -4.401417e-02 -4.334381e-02 -3.444465e-01 -1.967460e-02 1.193137e-01 -2.723141e-02
GMB 0.162689 8.598690e-02 -9.929674e-02 -2.316451e-01 2.262847e-01 1.330992e-01 2.662606e-01 1.742886e-01 -9.401992e-02 -6.012329e-02
GNB 0.000016 -1.996604e-02 -8.847847e-02 -2.131943e-02 6.445089e-02 9.726163e-02 -2.861441e-01 3.631337e-01 5.592464e-01 -2.128313e-01
KEN 0.117135 -1.733487e-01 -2.826062e-01 -1.414929e-01 -4.852900e-02 1.122855e-01 -1.773947e-01 -2.061400e-01 -2.190356e-01 1.721080e-01
LBY 0.000000 0.000000e+00 -2.584939e-26 -1.654361e-24 2.710505e-20 2.710505e-20 4.336809e-19 -6.938894e-18 5.551115e-17 -2.775558e-17
LSO 0.031001 -1.145362e-01 1.172217e-01 -8.783260e-02 -6.316290e-02 1.845925e-01 -1.155909e-01 4.273390e-01 1.846813e-01 -2.597209e-02
MAR 0.196450 -1.442017e-03 -1.407898e-01 -1.151253e-01 4.096951e-02 -2.501230e-01 -1.218791e-01 -1.159986e-01 -2.772756e-02 9.286997e-02
MDG 0.187159 1.768800e-01 -7.603241e-02 1.954600e-02 3.882583e-02 -6.632947e-02 1.303028e-02 1.697959e-01 3.997142e-02 -1.389174e-01
MLI 0.160911 -1.212446e-01 -4.690084e-02 -1.486856e-01 8.742975e-02 -2.762972e-01 8.729569e-02 2.434701e-01 -8.316575e-02 3.368757e-02
MOZ 0.147230 1.608981e-01 9.473738e-02 -1.983947e-02 -2.406356e-01 -2.654582e-01 -2.507807e-03 5.991142e-02 9.261240e-02 1.184466e-01
MRT 0.142351 2.001775e-01 7.014463e-02 -1.332407e-01 -2.101042e-01 -3.003770e-01 -1.237907e-01 -9.258926e-02 7.149717e-02 1.249696e-01
MUS 0.183529 7.441746e-02 -1.253785e-01 7.372575e-02 1.275615e-01 4.308231e-02 8.698864e-02 1.594759e-01 3.117765e-02 8.570793e-02
MWI 0.196370 1.208545e-01 -1.804584e-02 -1.462830e-01 2.430288e-02 7.000297e-02 -7.395843e-02 -3.780468e-02 -6.553461e-02 -1.841761e-02
NAM 0.173960 1.201903e-01 5.978034e-02 9.556719e-02 2.108371e-01 -2.454492e-02 2.251590e-01 1.968563e-01 -5.019903e-04 1.281223e-01
NER 0.191899 -1.071062e-01 1.036822e-01 -3.490165e-02 8.799988e-03 4.101665e-02 -1.036213e-01 -1.501394e-01 5.717845e-02 7.629436e-02
NGA 0.153000 -8.916565e-02 2.197113e-01 2.082393e-02 -2.395585e-01 -1.659439e-01 -1.019044e-01 1.527359e-01 -3.809505e-02 8.557721e-02
RWA 0.186734 -3.747262e-02 1.608489e-01 2.275355e-01 3.285765e-02 9.220145e-02 -1.415166e-01 -1.482959e-01 -3.582179e-03 2.383508e-02
SDN 0.122997 -2.804786e-01 1.221158e-01 -1.345090e-01 -5.996290e-02 -1.354197e-01 -2.219003e-02 1.378054e-01 -7.715869e-02 7.151525e-04
SEN 0.198048 -3.415201e-02 -1.017232e-01 -1.030800e-01 -1.082653e-01 -2.190300e-02 -1.642827e-02 -9.561341e-02 -9.388535e-02 5.110459e-02
SLE 0.000000 -5.204546e-35 8.399678e-33 1.376288e-30 -3.947723e-26 -8.283784e-26 -3.475898e-24 3.838377e-22 -4.456022e-21 -1.660979e-21
STP 0.093915 2.692090e-01 7.292042e-02 2.717052e-01 -2.160666e-01 2.404257e-01 -1.858177e-02 -4.529530e-03 -4.636809e-02 -6.325914e-02
SWZ 0.004459 -9.037762e-02 -2.172397e-01 4.529566e-02 -1.475804e-01 -4.240062e-03 1.559841e-01 -2.357850e-01 5.362103e-01 2.615336e-01
SYC 0.012375 -7.886680e-02 -1.900444e-01 2.392704e-01 -2.750048e-01 -4.851200e-02 5.049505e-01 2.850039e-02 1.579691e-01 1.255474e-02
TGO 0.184941 8.829184e-02 -4.195564e-02 1.516059e-02 -3.407124e-02 -7.070676e-02 -2.815878e-02 -5.328573e-03 -7.180032e-03 -5.552878e-02
TUN 0.189500 -1.490447e-01 -5.294429e-02 1.368356e-01 1.828756e-01 1.829014e-02 -1.057055e-01 1.539954e-01 1.181978e-02 -9.518775e-04
TZA 0.198557 1.129045e-01 -8.978178e-02 -1.479353e-01 1.914884e-01 -8.737537e-02 1.348190e-01 8.895631e-02 -7.496216e-02 2.430973e-02
UGA 0.220482 1.310309e-01 -7.584076e-02 3.261941e-02 7.776106e-02 2.816737e-02 -2.272271e-02 3.592921e-03 -2.836216e-02 -7.275351e-02
ZAF 0.190412 -7.411430e-02 -1.422615e-01 1.624323e-01 5.822959e-02 -5.136024e-02 -2.069728e-01 5.163535e-03 -1.084788e-02 1.159689e-01
ZMB 0.150023 -1.782886e-01 -7.122649e-02 2.088026e-01 1.466777e-01 2.720530e-01 -5.921551e-02 -1.225385e-01 6.453899e-03 -3.512622e-02
ZWE 0.098964 -1.998928e-01 2.092424e-01 -4.305728e-02 1.206815e-01 -8.504574e-02 1.333520e-01 -1.496703e-01 1.545886e-01 -1.980936e-01
biplot_PCA(country_features, 10, 3, 4, obs='section')
biplot_PCA(country_features, 10, 5, 6, obs='section')

Explained variance

scree_plot(country_features)

Variance-stabilizing and normalizing transformations

country_features.apply(lambda x: (x == 0).all(), axis=0)
reporter.ISO
AGO    False
BDI    False
BEN    False
BFA    False
BWA    False
CAF    False
CIV    False
CMR    False
COG    False
COM    False
CPV    False
DZA    False
EGY    False
ETH    False
GAB    False
GHA    False
GIN    False
GMB    False
GNB    False
KEN    False
LBY     True
LSO    False
MAR    False
MDG    False
MLI    False
MOZ    False
MRT    False
MUS    False
MWI    False
NAM    False
NER    False
NGA    False
RWA    False
SDN    False
SEN    False
SLE     True
STP    False
SWZ    False
SYC    False
TGO    False
TUN    False
TZA    False
UGA    False
ZAF    False
ZMB    False
ZWE    False
dtype: bool
country_features = country_features.drop(columns=['LBY', 'SLE'])
# fig, axes = joypy.joyplot(country_features.iloc[:,0:22], colormap=plt.cm.magma, figsize=(8,8));
# fig, axes = joypy.joyplot(country_features.iloc[:,22:47], colormap=plt.cm.magma, figsize=(8,8));
country_features_log = country_features.apply(lambda x: np.log(x+1) if np.issubdtype(x.dtype, np.number) else x, axis=0)
# fig, axes = joypy.joyplot(country_features_log.iloc[:,0:22], colormap=plt.cm.magma, figsize=(8,8));
# fig, axes = joypy.joyplot(country_features_log.iloc[:,22:47], colormap=plt.cm.magma, figsize=(8,8));
country_features_yeo = power_transform(country_features, method='yeo-johnson', standardize=True)
country_features_yeo = pd.DataFrame(country_features_yeo,
                                    index=country_features.index,
                                    columns=country_features.columns)
# fig, axes = joypy.joyplot(country_features_yeo.iloc[:,0:22], colormap=plt.cm.magma, figsize=(8,8));
# fig, axes = joypy.joyplot(country_features_yeo.iloc[:,22:47], colormap=plt.cm.magma, figsize=(8,8));
biplot_PCA(country_features_log, 10, 1, 2, obs='section')
biplot_PCA(country_features_yeo, 10, 1, 2, obs='section')

PCA on bilateral trade matrix

partner_features = create_features(IFF_Dest_Imp, 'Imp_IFF_hi', 
                                   features='partner.ISO', obs='reporter.ISO')
partner_features
partner.ISO AGO ALB AND ARE ARG ARM ATG AUS AUT AZE ... URY USA VCT VEN VNM VUT YEM ZAF ZMB ZWE
reporter.ISO year
AGO 2009 0.0 0.0 0.0 0.000000e+00 6.014820e+07 0.0 0 6.980492e+06 3.119279e+05 0.000000 ... 2.361993e+06 9.270820e+08 0 0.0 4.734647e+07 0 0.000000 1.016827e+09 1.060946e+05 309894.487818
2010 0.0 0.0 0.0 0.000000e+00 2.963158e+07 0.0 0 2.971282e+06 5.626929e+06 18935.636990 ... 1.644130e+06 8.268511e+08 0 0.0 2.970677e+07 0 0.000000 3.315930e+08 2.665205e+04 420951.156359
2011 0.0 0.0 0.0 0.000000e+00 5.044258e+07 0.0 0 9.077825e+06 2.409418e+06 20266.842796 ... 1.016561e+06 8.601995e+08 0 0.0 2.451965e+07 0 0.000000 3.281534e+08 4.525653e+05 0.000000
2012 0.0 0.0 0.0 0.000000e+00 2.055669e+08 0.0 0 7.483480e+06 1.354856e+07 22457.788787 ... 1.582490e+06 1.441162e+09 0 0.0 5.942382e+07 0 354866.063494 7.944924e+08 1.180964e+03 0.000000
2013 0.0 0.0 0.0 0.000000e+00 5.839522e+07 0.0 0 1.569576e+07 8.770795e+06 1734.698687 ... 3.447835e+06 7.139003e+08 0 0.0 3.931132e+07 0 0.000000 4.272753e+08 2.855892e+05 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ZWE 2011 0.0 0.0 0.0 5.290029e+07 2.310119e+07 0.0 0 2.325259e+06 3.171896e+05 0.000000 ... 2.721346e+04 6.462989e+08 0 0.0 5.288219e+06 0 0.000000 3.088036e+09 5.016118e+07 0.000000
2012 0.0 0.0 0.0 5.923175e+07 1.011059e+07 0.0 0 1.664879e+06 1.220841e+06 0.000000 ... 0.000000e+00 6.069423e+08 0 0.0 4.745996e+06 0 0.000000 1.242790e+09 1.704507e+08 0.000000
2013 0.0 0.0 0.0 0.000000e+00 0.000000e+00 0.0 0 0.000000e+00 0.000000e+00 0.000000 ... 0.000000e+00 0.000000e+00 0 0.0 0.000000e+00 0 0.000000 0.000000e+00 0.000000e+00 0.000000
2014 0.0 0.0 0.0 0.000000e+00 0.000000e+00 0.0 0 0.000000e+00 0.000000e+00 0.000000 ... 0.000000e+00 0.000000e+00 0 0.0 0.000000e+00 0 0.000000 0.000000e+00 0.000000e+00 0.000000
2015 0.0 0.0 0.0 5.042948e+07 0.000000e+00 0.0 0 5.764013e+05 6.721992e+05 0.000000 ... 9.012925e+04 4.109564e+07 0 0.0 3.093477e+03 0 0.000000 6.474374e+08 0.000000e+00 0.000000

624 rows × 167 columns

Biplots

biplot_PCA(partner_features, 10, 1, 2, show_loadings=True)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
AGO 0.064538 -0.127775 -0.044675 -0.081190 0.015199 -0.030508 -0.020684 -0.075254 0.026087 0.018444
ALB 0.044858 0.100240 -0.085332 0.010378 0.222166 -0.056160 0.016405 -0.068677 0.098802 -0.006931
AND 0.002434 0.006593 -0.006567 0.005445 -0.015707 0.021064 -0.006253 -0.004508 0.001468 -0.016877
ARE 0.091433 -0.038099 0.067933 -0.046062 0.123878 -0.061712 -0.030800 0.190913 -0.147428 0.060691
ARG 0.129513 0.139207 -0.082968 0.019367 -0.043149 0.085106 -0.014389 0.038525 0.024086 0.000942
... ... ... ... ... ... ... ... ... ... ...
VUT 0.000000 0.000000 0.000000 0.000000 -0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
YEM 0.040005 -0.060918 0.007451 0.143530 0.037673 0.037506 0.085598 0.261330 0.309931 -0.017295
ZAF 0.008847 0.007834 0.091261 -0.022447 0.026785 0.001632 -0.043753 0.104236 -0.037778 0.124183
ZMB 0.040152 0.020620 -0.037492 0.009481 0.093282 -0.176393 -0.033166 0.069316 -0.109787 0.018559
ZWE 0.058102 -0.112442 0.020694 0.215291 0.042842 0.010566 0.044425 0.149634 0.182897 0.017658

167 rows × 10 columns

top-destinations

biplot_PCA(partner_features, 10, 3, 4)
biplot_PCA(partner_features, 10, 5, 6)

Explained variance

scree_plot(partner_features)

Variance-stabilizing and normalizing transformations

partner_features.apply(lambda x: (x == 0).all(), axis=0)
partner.ISO
AGO    False
ALB    False
AND    False
ARE    False
ARG    False
       ...  
VUT     True
YEM    False
ZAF    False
ZMB    False
ZWE    False
Length: 167, dtype: bool
noIFFpartner = partner_features.loc[:,partner_features.apply(lambda x: (x == 0).all(), axis=0)].columns.tolist()
noIFFpartner
['ATG',
 'BLZ',
 'BMU',
 'DMA',
 'GNB',
 'GRD',
 'KGZ',
 'KNA',
 'LCA',
 'SLB',
 'TON',
 'VCT',
 'VUT']
partner_features = partner_features.drop(columns=noIFFpartner)
# fig, axes = joypy.joyplot(partner_features.iloc[:,0:20], colormap=plt.cm.rainbow);
partner_features_log = partner_features.apply(lambda x: np.log(x+1) if np.issubdtype(x.dtype, np.number) else x, axis=1)
# fig, axes = joypy.joyplot(partner_features_log.iloc[:,0:20], colormap=plt.cm.rainbow, figsize=(8,8));
biplot_PCA(partner_features_log, 10, 1, 2)

Intra-African illicit financial flows

Data wrangling

panel = pyreadr.read_r('Data/GER_Orig_Dest_Year_Africa.Rdata')
IFF_Dest = panel['GER_Orig_Dest_Year_Africa']
IFF_Dest_AFR = IFF_Dest[IFF_Dest['pRegion'] == 'Africa']
IFF_Dest_AFR = IFF_Dest_AFR.fillna(0).drop(columns=['reporter', 'rIncome', 'rDev', 
                                                    'partner', 'pRegion', 'pIncome', 'pDev',
                                                    'Imp_IFF_lo', 'Exp_IFF_lo']) \
            .set_index(['reporter.ISO', 'year'])

IFF_Dest_Imp_AFR = IFF_Dest_AFR[['partner.ISO', 'Imp_IFF_hi']]
IFF_Dest_Exp_AFR = IFF_Dest_AFR[['partner.ISO', 'Exp_IFF_hi']]
partner_features_AFR = create_features(IFF_Dest_Imp_AFR, 'Imp_IFF_hi', 
                                       features='partner.ISO', obs='reporter.ISO')
partner_features_AFR
partner.ISO AGO BDI BEN BFA BWA CAF CIV CMR COG COM ... STP SWZ SYC TGO TUN TZA UGA ZAF ZMB ZWE
reporter.ISO year
AGO 2009 0.0 0.0 0.0 0.0 0.000000e+00 0.0 7.356525e+05 0.000000 0.000000e+00 0.0 ... 0.000000 0.0 0.0 0.000000 1.628157e+06 0.000000e+00 0.0 1.016827e+09 1.060946e+05 309894.487818
2010 0.0 0.0 0.0 0.0 2.025235e+05 0.0 5.577699e+07 53088.378146 1.402793e+08 0.0 ... 0.000000 0.0 0.0 252999.300576 1.137066e+08 5.529461e+03 0.0 3.315930e+08 2.665205e+04 420951.156359
2011 0.0 0.0 0.0 0.0 5.501246e+03 0.0 6.262938e+06 257786.746458 3.466061e+07 0.0 ... 0.000000 0.0 0.0 0.000000 4.623718e+07 1.745372e+06 0.0 3.281534e+08 4.525653e+05 0.000000
2012 0.0 0.0 0.0 0.0 0.000000e+00 0.0 1.425377e+07 222663.224270 2.587614e+07 0.0 ... 8435.069683 0.0 0.0 0.000000 1.262796e+07 1.253616e+05 0.0 7.944924e+08 1.180964e+03 0.000000
2013 0.0 0.0 0.0 0.0 2.945193e+05 0.0 2.125395e+06 0.000000 4.357466e+06 0.0 ... 957.605075 0.0 0.0 0.000000 4.438123e+06 1.035603e+05 0.0 4.272753e+08 2.855892e+05 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ZWE 2011 0.0 0.0 0.0 0.0 8.741233e+07 0.0 0.000000e+00 0.000000 0.000000e+00 0.0 ... 0.000000 0.0 0.0 0.000000 0.000000e+00 1.295280e+06 0.0 3.088036e+09 5.016118e+07 0.000000
2012 0.0 0.0 0.0 0.0 3.187848e+07 0.0 0.000000e+00 0.000000 0.000000e+00 0.0 ... 0.000000 0.0 0.0 0.000000 0.000000e+00 2.729399e+06 0.0 1.242790e+09 1.704507e+08 0.000000
2013 0.0 0.0 0.0 0.0 0.000000e+00 0.0 0.000000e+00 0.000000 0.000000e+00 0.0 ... 0.000000 0.0 0.0 0.000000 0.000000e+00 0.000000e+00 0.0 0.000000e+00 0.000000e+00 0.000000
2014 0.0 0.0 0.0 0.0 0.000000e+00 0.0 0.000000e+00 0.000000 0.000000e+00 0.0 ... 0.000000 0.0 0.0 0.000000 0.000000e+00 0.000000e+00 0.0 0.000000e+00 0.000000e+00 0.000000
2015 0.0 0.0 0.0 0.0 8.142369e+06 0.0 0.000000e+00 0.000000 0.000000e+00 0.0 ... 0.000000 0.0 0.0 0.000000 0.000000e+00 8.238550e+06 0.0 6.474374e+08 0.000000e+00 0.000000

604 rows × 46 columns

partner_features_AFR = pd.merge(left=partner_features_AFR.reset_index(), 
                                right=crosswalk[['ISO3166.3', 'UN_Intermediate_Region']].drop_duplicates('ISO3166.3'), 
                                how='left', 
                                left_on='reporter.ISO', right_on='ISO3166.3')

# KEEP THIS NARRATIVE TO EXPLAIN FUNCTION
noregion_mask = partner_features_AFR['UN_Intermediate_Region'].isnull()
countries_noregion = partner_features_AFR[noregion_mask]['reporter.ISO'].unique().tolist()
countries_noregion
['DZA', 'EGY', 'LBY', 'MAR', 'SDN', 'TUN']
crosswalk[crosswalk['ISO3166.3'].isin(countries_noregion)]['UN_Sub-region'].unique()
array(['Northern Africa'], dtype=object)
partner_features_AFR.loc[partner_features_AFR['reporter.ISO'].isin(countries_noregion), 'UN_Intermediate_Region'] = 'Northern Africa'

# region_labels = partner_features_AFR.reset_index().set_index('UN_Intermediate_Region').index.tolist()
partner_features_AFR = partner_features_AFR.set_index(['reporter.ISO', 'year']).drop(columns=['ISO3166.3', 'UN_Intermediate_Region'])
# region_labels
biplot_PCA(partner_features_AFR, partner_features_AFR.shape[1], 1, 2)

Remove outliers

features_data_std = StandardScaler().fit_transform(partner_features_AFR)
pca = PCA(n_components=partner_features_AFR.shape[1])
princ_comp = pca.fit_transform(features_data_std)
#outlying = (princ_comp[:,44] > 1.5) | (princ_comp[:,45] > 3e-15)
#outlying = (princ_comp[:,44] > 0.5) | (princ_comp[:,45] > 1.5e-15)
outlying = (princ_comp[:,0] > 6) | (princ_comp[:,1] > 4)
partner_features_AFR[outlying].index
MultiIndex([('AGO', '2010'),
            ('CIV', '2013'),
            ('EGY', '2012'),
            ('GHA', '2002'),
            ('MAR', '2005'),
            ('MAR', '2006'),
            ('MAR', '2007'),
            ('MAR', '2008'),
            ('MAR', '2009'),
            ('MAR', '2010'),
            ('MAR', '2011'),
            ('MAR', '2012'),
            ('NGA', '2011'),
            ('ZAF', '2005'),
            ('ZAF', '2006'),
            ('ZAF', '2008'),
            ('ZAF', '2010'),
            ('ZAF', '2011'),
            ('ZAF', '2012'),
            ('ZAF', '2013'),
            ('ZAF', '2014'),
            ('ZAF', '2015'),
            ('ZAF', '2016')],
           names=['reporter.ISO', 'year'])
partner_features_AFR_noout = partner_features_AFR[~outlying]
biplot_PCA(partner_features_AFR_noout, 46, 1, 2)

Coloring by class label

def region_labels(features_data):
    """
    TODO: add doc string
    """
    # Extract observation labels from features data
    obs_labels = pd.DataFrame(features_data.reset_index().set_index('reporter.ISO').index)

    # Merge with UN intermediate regions from crosswalk
    obs_labels = pd.merge(left=obs_labels,
                          right=crosswalk[['ISO3166.3', 'UN_Intermediate_Region']].drop_duplicates('ISO3166.3'), 
                          how='left', 
                          left_on='reporter.ISO', right_on='ISO3166.3') \
    .drop(columns='ISO3166.3')

    # Create a mask for the Northern African regions and replace the missing intermediate region
    noregion_mask = obs_labels['UN_Intermediate_Region'].isnull()
    countries_noregion = obs_labels[noregion_mask]['reporter.ISO'].unique().tolist()
    obs_labels.loc[obs_labels['reporter.ISO'].isin(countries_noregion), 'UN_Intermediate_Region'] = 'Northern Africa'

    return obs_labels
def biplot_PCA_classes(features_data, nPC=2, firstPC=1, secondPC=2, classes='UN_Intermediate_Region'):
    """
    TODO: update docstring
    Projects the data in the 2-dimensional space spanned by 2 principal components
    chosen by the user, along with a bi-plot of the top 3 loadings per PC, and colors
    by class label.

    Args:
        features_data: data-set of features
        nPC: number of principal components
        firstPC: integer denoting first principal component to plot in bi-plot
        secondPC: integer denoting second principal component to plot in bi-plot
        obs: string denoting index of class labels (in features_data)
        show_loadings: Boolean indicating whether PCA loadings should be displayed
    Returns:
        plot (interactive)
        pca_loadings (if show_loadings=True)
    """
    
    # Unit of observation is the reporting country
    obs='reporter.ISO'
    
    # Run PCA (standardize beforehand)
    features_data_std = StandardScaler().fit_transform(features_data)
#     features_data_std = power_transform(features_data, method='yeo-johnson', standardize=True)
    pca = PCA(n_components=nPC, random_state=234)
    princ_comp = pca.fit_transform(features_data_std)

    # Loadings
    cols = ['PC' + str(c+1) for c in np.arange(nPC)]
    pca_loadings = pd.DataFrame(pca.components_.T, 
                                columns=cols,
                                index=list(features_data.columns))
   
    # Scores
    pca_scores = pd.DataFrame(princ_comp, 
                              columns=cols)
    pca_scores[obs] = features_data.reset_index()[obs].values.tolist()
    pca_scores['year'] = features_data.reset_index()['year'].values.tolist()
    
    score_PC1 = princ_comp[:,firstPC-1]
    score_PC2 = princ_comp[:,secondPC-1]
    
    # Plot data
    plot_data = pd.merge(pca_scores, obs_info, on=[obs, 'year'])
    tooltip_obs = ['reporter', 'year', 'Income group (World Bank)', 'Country status (UN)']
    
    obs_labels = region_labels(features_data)
    plot_data = pd.merge(left=plot_data, right=obs_labels.drop_duplicates('reporter.ISO'), on='reporter.ISO')
    
    if classes == 'UN_Intermediate_Region':
        color_scheme = 'paired'
    else:
        color_scheme = 'dark2'

    # Return chosen PCs to plot
    PC1 = 'PC'+str(firstPC)
    PC2 = 'PC'+str(secondPC)
    
    # Top loadings (in absolute value)
    # TO DO: use dict to iterate over
    toploadings_PC1 = pca_loadings.apply(lambda x: abs(x)).sort_values(by=PC1).tail(3)[[PC1, PC2]]
    toploadings_PC2 = pca_loadings.apply(lambda x: abs(x)).sort_values(by=PC2).tail(3)[[PC1, PC2]]

    originsPC1 = pd.DataFrame({'index':toploadings_PC1.index.tolist(), 
                               PC1: np.zeros(3), 
                               PC2: np.zeros(3)})
    originsPC2 = pd.DataFrame({'index':toploadings_PC2.index.tolist(), 
                               PC1: np.zeros(3), 
                               PC2: np.zeros(3)})
    
    toploadings_PC1 = pd.concat([toploadings_PC1.reset_index(), originsPC1], axis=0)
    toploadings_PC2 = pd.concat([toploadings_PC2.reset_index(), originsPC2], axis=0)

    toploadings_PC1[PC1] = toploadings_PC1[PC1]*max(score_PC1)*1.5
    toploadings_PC1[PC2] = toploadings_PC1[PC2]*max(score_PC2)*1.5
    toploadings_PC2[PC1] = toploadings_PC2[PC1]*max(score_PC1)*1.5
    toploadings_PC2[PC2] = toploadings_PC2[PC2]*max(score_PC2)*1.5
    
    # Project top 3 loadings over the space spanned by 2 principal components
    lines = alt.Chart().mark_line().encode()
    for color, i, dataset in zip(['#440154FF', '#21908CFF'], [0,1], [toploadings_PC1, toploadings_PC2]):
        lines[i] = alt.Chart(dataset).mark_line(color=color).encode(
        x= PC1 +':Q',
        y= PC2 +':Q',
        detail='index'
    ).properties(
        width=500,
        height=500
    )
    
    # Add labels to the loadings
    text=alt.Chart().mark_text().encode()
    for color, i, dataset in zip(['#440154FF', '#21908CFF'], [0, 1], [toploadings_PC1[0:3], toploadings_PC2[0:3]]):
        text[i] = alt.Chart(dataset).mark_text(
                align='left',
                baseline='bottom',
                color=color
            ).encode(
                x= PC1 +':Q',
                y= PC2 +':Q',
                text='index'
            )
    
    # Scatter plot colored by observation class label
    points = alt.Chart(plot_data).mark_circle(size=60).encode(
        x=alt.X(PC1, axis=alt.Axis(title='Principal Component ' + str(firstPC))),
        y=alt.X(PC2, axis=alt.Axis(title='Principal Component ' + str(secondPC))),
        color=alt.Color(classes, scale=alt.Scale(scheme=color_scheme),
               legend=alt.Legend(orient='right')),
        tooltip=tooltip_obs
    ).interactive()
    
    chart = (points + lines[0] + lines[1] + text[0] + text[1])    
    chart.display()
biplot_PCA_classes(sector_features, 21, 1, 2, classes='UN_Intermediate_Region')
biplot_PCA_classes(sector_features, 21, 1, 2, classes='Income group (World Bank)')
biplot_PCA_classes(partner_features, 46, 1, 2)
biplot_PCA_classes(partner_features, 46, 1, 2, classes='Income group (World Bank)')
biplot_PCA_classes(partner_features_AFR, 46, 1, 2)
biplot_PCA_classes(partner_features_AFR, 46, 1, 2, classes='Income group (World Bank)')
biplot_PCA_classes(partner_features_AFR_noout, 46, 1, 2)
biplot_PCA_classes(partner_features_AFR_noout, 46, 1, 2, classes='Income group (World Bank)')

Clustering

# print(__doc__)

# import numpy as np

# from sklearn.cluster import DBSCAN
# from sklearn import metrics
# from sklearn.datasets import make_blobs
# from sklearn.preprocessing import StandardScaler


# # #############################################################################
# # Generate sample data
# centers = [[1, 1], [-1, -1], [1, -1]]
# X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
#                             random_state=0)

# X = StandardScaler().fit_transform(X)

# # #############################################################################
# # Compute DBSCAN
# db = DBSCAN(eps=0.3, min_samples=10).fit(X)
# core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True
# labels = db.labels_

# # Number of clusters in labels, ignoring noise if present.
# n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# n_noise_ = list(labels).count(-1)

# print('Estimated number of clusters: %d' % n_clusters_)
# print('Estimated number of noise points: %d' % n_noise_)
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f"
#       % metrics.adjusted_rand_score(labels_true, labels))
# print("Adjusted Mutual Information: %0.3f"
#       % metrics.adjusted_mutual_info_score(labels_true, labels))
# print("Silhouette Coefficient: %0.3f"
#       % metrics.silhouette_score(X, labels))

# # #############################################################################
# # Plot result
# import matplotlib.pyplot as plt

# # Black removed and is used for noise instead.
# unique_labels = set(labels)
# colors = [plt.cm.Spectral(each)
#           for each in np.linspace(0, 1, len(unique_labels))]
# for k, col in zip(unique_labels, colors):
#     if k == -1:
#         # Black used for noise.
#         col = [0, 0, 0, 1]

#     class_member_mask = (labels == k)

#     xy = X[class_member_mask & core_samples_mask]
#     plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
#              markeredgecolor='k', markersize=14)

#     xy = X[class_member_mask & ~core_samples_mask]
#     plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
#              markeredgecolor='k', markersize=6)

# plt.title('Estimated number of clusters: %d' % n_clusters_)
# plt.show()
from sklearn.cluster import DBSCAN

features_data_std = StandardScaler().fit_transform(partner_features_AFR_noout)
pca = PCA(n_components=10)
princ_comp = pca.fit_transform(features_data_std)
X = princ_comp[:,[0,1]]
    
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
# print('Estimated number of noise points: %d' % n_noise_)
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f"
#       % metrics.adjusted_rand_score(labels_true, labels))
# print("Adjusted Mutual Information: %0.3f"
#       % metrics.adjusted_mutual_info_score(labels_true, labels))
# print("Silhouette Coefficient: %0.3f"
#       % metrics.silhouette_score(X, labels))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
Estimated number of clusters: 3
_images/Unsupervised-learning-trade_119_1.png
features_data_std = StandardScaler().fit_transform(partner_features_AFR_noout)
pca = PCA(n_components=10)
princ_comp = pca.fit_transform(features_data_std)
X = princ_comp[:,[0,1]]

from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(n_clusters=5).fit(X)
clustering.labels_
plt.scatter(X[:, 0], X[:, 1], c=clustering.labels_);
_images/Unsupervised-learning-trade_120_0.png

Hierarchical clustering and corresponding heatmap

# Create data for African bilateral trade matrix in 2016
data = partner_features_AFR_noout.reset_index() \
    .query('year == "2016"').drop(columns='year').set_index('reporter.ISO')

# Select the columns which have at least 1 non-zero row 
data = data.loc[:,(data != 0).any(axis=0)]

# Log the data
data = data.apply(lambda x: np.log10(x+1) if np.issubdtype(x.dtype, np.number) else x, axis=0)

# Scale the data
scaler = StandardScaler()
# scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data)
data_scaled = pd.DataFrame(data_scaled, index=data.index, columns=data.columns)

# Extract region labels for the observations
obs_labels = region_labels(data).set_index('reporter.ISO')

# Extract unique regions
color_labels = obs_labels['UN_Intermediate_Region'].unique()
color_labels = np.sort(color_labels)

# List of RGB triplets for the region labels
rgb_values = sns.color_palette('Paired', 5)

# Map region labels to RGB triplet
color_map = dict(zip(color_labels, rgb_values))
region_color = pd.DataFrame(obs_labels)['UN_Intermediate_Region'].map(color_map)
# Plot heatmap and corresponding dendograms
g = sns.clustermap(data_scaled,
                   method='ward',
                   cmap='bone_r',
                   row_colors=region_color,
                  );
g.fig.suptitle('Hierarchical clustering with Ward linkage', y=1);
_images/Unsupervised-learning-trade_123_0.png

Extra stuff (archive)

# cols = ['PC' + str(c+1) for c in np.arange(nPC)]
# pca_scores = pd.DataFrame(princ_comp,
#                           columns=cols,
#                           index=target,)
# pca_scores['class'] = target

# sns.lmplot(x='PC1', 
#            y='PC2', 
#            data=pca_scores, fit_reg=False, hue='class');
# plt.figure()
# lw = 2

# for i, target_name in zip(list(rng), target_names):
#     plt.scatter(princ_comp[y == i, 0], princ_comp[y == i, 1], 
#                 alpha=.8, lw=lw,
#                 label=target_name)
# plt.legend(loc='best', shadow=False, scatterpoints=1)
# plt.title('PCA of trade dataset');
# DO NOT MODIFY
def biplot_PCA_static(features_data, nPC=2, firstPC=1, secondPC=2, obs='reporter.ISO', show_loadings=False):
    """Projects the data in the 2-dimensional space spanned by 2 principal components
    chosen by the user, along with a bi-plot of the top 3 loadings per PC, and colors
    by class label.
    
    Args:
        features_data: data-set of features
        nPC: number of principal components
        firstPC: integer denoting first principal component to plot in bi-plot
        secondPC: integer denoting second principal component to plot in bi-plot
        obs: string denoting index of class labels (in features_data)
        show_loadings: Boolean indicating whether PCA loadings should be displayed
    Returns:
        plot (static)
        pca_loadings (if show_loadings=True)
    """
    
    # Run PCA (standardize beforehand)
    features_data_std = StandardScaler().fit_transform(features_data)
    pca = PCA(n_components=nPC)
    princ_comp = pca.fit_transform(features_data_std)

    # Loadings
    cols = ['PC' + str(c+1) for c in np.arange(nPC)]
    pca_loadings = pd.DataFrame(pca.components_.T, 
                                columns=cols,
                                index=list(features_data.columns))

    # Scores
    score_PC1 = princ_comp[:,firstPC-1]
    score_PC2 = princ_comp[:,secondPC-1]
    
    pca_scores = pd.DataFrame(princ_comp, 
                              columns=cols)
    
    # Return chosen PCs to plot
    PC1 = 'PC'+str(firstPC)
    PC2 = 'PC'+str(secondPC)

    # Top loadings (in absolute value)
#     toploadings_PC1 = pca_loadings.nlargest(3, PC1)
#     toploadings_PC2 = pca_loadings.nlargest(3, PC2)
    toploadings_PC1 = pca_loadings.apply(lambda x: abs(x)).sort_values(by=PC1).tail(3)
    toploadings_PC2 = pca_loadings.apply(lambda x: abs(x)).sort_values(by=PC2).tail(3)

    # Plot bi-plot
    plt.figure(figsize=(10,10))
    for i in range(toploadings_PC1.shape[0]):
        plt.arrow(0, 0, 
                  toploadings_PC1[PC1][i]*max(score_PC1)*1.5, 
                  toploadings_PC1[PC2][i]*max(score_PC2)*1.5,
                  color='r', width=0.0005, head_width=0.0025)
        plt.text(toploadings_PC1[PC1][i]*max(score_PC1)*1.5, 
                 toploadings_PC1[PC2][i]*max(score_PC2)*1.5,
                 toploadings_PC1.index.tolist()[i], color='r')
    for i in range(toploadings_PC2.shape[0]):
        plt.arrow(0, 0, 
                  toploadings_PC2[PC1][i]*max(score_PC1)*1.5, 
                  toploadings_PC2[PC2][i]*max(score_PC2)*1.5,
                  color='g', width=0.0005, head_width=0.0025)
        plt.text(toploadings_PC2[PC1][i]*max(score_PC1)*1.5, 
                 toploadings_PC2[PC2][i]*max(score_PC2)*1.5,
                 toploadings_PC2.index.tolist()[i], color='g')
        
    # Extract class labels
    target = features_data.reset_index().set_index(obs).index.tolist()
    y = pd.Series(target).astype('category').cat.codes.values
    target_names = np.unique(target)
    rng = np.arange(0, len(target_names))
    
    # Add points
    for i, target_name in zip(list(rng), target_names):
         plt.scatter(princ_comp[y == i, firstPC-1], princ_comp[y == i, secondPC-1], 
                     alpha=.5, lw=2)
    plt.xlabel('Principal Component ' + str(firstPC))
    plt.ylabel('Principal Component ' + str(secondPC))
    
    if show_loadings:
        return pca_loadings
biplot_PCA_static(sector_features_log, 10, 1, 2, obs='reporter.ISO')
_images/Unsupervised-learning-trade_128_0.png
# ! jupyter nbconvert Unsupervised-learning-trade.ipynb --to slides --SlidesExporter.reveal_scroll=True

Concluding thoughts

  • Target 16.4 of the SDGs: “By 2030, significantly reduce illicit financial and arms flows, strengthen the recovery and return of stolen assets and combat all forms of organized crime”

  • Increasing international policy attention to the problem has lead to demands for more accurate estimates.

  • Accurate estimates of the nature and of the scale of the problem can support policy prioritization and targeted intervention to combat illicit flows.

Next steps

  • hierarchical clustering (and reordered heatmap)

  • explore clustering a bit more

  • NMF on bilateral trade matrix which also includes sectors, so unit of observation is dyad, and features are sectors

  • Merge in GDP and (maybe?) distance measures to see if NMF recovers gravity interpretation of international trade

Network analysis

Data wrangling

indicators = {'CC.PER.RNK': 'control-corruption', 
              'RL.PER.RNK': 'rule-of-law',
              'NY.GDP.PCAP.CD': 'gdp-pc',
              'SP.RUR.TOTL.ZS': 'rural-pop',
              'AG.LND.AGRI.ZS' : 'agric-land'}
covariates = wb.get_dataframe(indicators,  
                              data_date=(datetime.datetime(2016, 1, 1)))
covariates
control-corruption rule-of-law gdp-pc rural-pop agric-land
Afghanistan 3.365385 5.769231 547.228110 74.980000 58.067580
Albania 40.865380 41.826920 4124.108907 41.579000 43.127735
Algeria 27.884610 18.750000 3946.421445 28.541000 17.365539
American Samoa 87.500000 87.980770 11696.955562 12.802000 24.500000
Andorra 87.500000 90.865390 37224.108916 11.752000 39.957448
... ... ... ... ... ...
West Bank and Gaza 51.923080 40.865380 3074.291152 24.372000 49.322261
World NaN NaN 10258.006620 45.630154 37.430740
Yemen, Rep. 1.442308 3.365385 1139.870568 64.606000 44.597231
Zambia 41.346150 43.269230 1280.578447 57.562000 32.063923
Zimbabwe 9.615385 8.173077 1464.583529 67.704000 41.876696

273 rows × 5 columns

covariates = pd.merge(left=covariates.reset_index().rename(columns={'index': 'country'}), 
                      right=crosswalk[['country', 'ISO3166.3']], 
                      how='left', on='country').rename(columns={'date': 'year'})
covariates = covariates.dropna(subset=['ISO3166.3']).set_index('ISO3166.3')
def create_graph_data(year='2016', threshold=True, threshold_var='GDP', flow_threshold=10000):
    """TO DO: add doc string and comments"""
    flow_data = IFF_Dest.reset_index().query('year == @year')
    flow_data = flow_data.loc[flow_data['Imp_IFF_hi'].notnull(), :]
    flow_data = flow_data.query('Imp_IFF_hi >= @flow_threshold')
    
    if threshold:
        conduits = 'conduits_' + threshold_var
        flow_data = flow_data[flow_data['reporter.ISO'].isin(eval(conduits).index)]
    return flow_data

Thresholding

panel = pyreadr.read_r('Data/GER_Orig_Year_Africa.Rdata')
IFF_Year = panel['GER_Orig_Year_Africa']
IFF_Year = IFF_Year.query('year == "2016"')
sns.distplot(IFF_Year['Tot_IFF_hi_GDP'], kde=True, label='% GDP', bins=10);
sns.distplot(IFF_Year['Tot_IFF_hi_trade'], kde=True, label='% trade', bins=10);
plt.legend()
plt.title('Distribution of IFF in Africa in 2016')
plt.xlabel('Mis-invoiced trade for African countries');
_images/Unsupervised-learning-trade_139_0.png
thresh_GDP = 0.1
thresh_trade = 0.18
conduits_GDP = IFF_Year.query('Tot_IFF_hi_GDP >= @thresh_GDP').set_index('reporter.ISO')
conduits_trade = IFF_Year.query('Tot_IFF_hi_trade >= @thresh_trade').set_index('reporter.ISO')
conduits_GDP
reporter rIncome rDev year Imp_IFF_lo Imp_IFF_hi Exp_IFF_lo Exp_IFF_hi Tot_IFF_lo Tot_IFF_hi Tot_IFF_lo_bn Tot_IFF_hi_bn GDP GNPpc Tot_IFF_lo_GDP Tot_IFF_hi_GDP Total_value Tot_IFF_lo_trade Tot_IFF_hi_trade
reporter.ISO
MLI Mali LIC Developing 2016 1.133595e+08 1.429417e+09 5.662107e+06 4.662387e+02 1.190216e+08 1.429417e+09 0.119022 1.429417 1.401079e+10 770.0 0.008495 0.102023 6.788468e+09 0.017533 0.210566
MOZ Mozambique LIC Developing 2016 1.797333e+08 1.556931e+09 6.524167e+06 1.277078e+04 1.862574e+08 1.556944e+09 0.186257 1.556944 1.098136e+10 480.0 0.016961 0.141781 8.647422e+09 0.021539 0.180047
SYC Seychelles HIC Developing 2016 6.343312e+07 3.096034e+08 3.134127e+04 7.357442e+00 6.346446e+07 3.096034e+08 0.063464 0.309603 1.427525e+09 13830.0 0.044458 0.216881 2.271627e+09 0.027938 0.136291
TUN Tunisia LMC Developing 2016 1.349524e+09 4.200118e+09 2.546233e+07 1.217210e+06 1.374987e+09 4.201335e+09 1.374987 4.201335 4.180838e+10 3720.0 0.032888 0.100490 3.306234e+10 0.041588 0.127073
conduits_trade
reporter rIncome rDev year Imp_IFF_lo Imp_IFF_hi Exp_IFF_lo Exp_IFF_hi Tot_IFF_lo Tot_IFF_hi Tot_IFF_lo_bn Tot_IFF_hi_bn GDP GNPpc Tot_IFF_lo_GDP Tot_IFF_hi_GDP Total_value Tot_IFF_lo_trade Tot_IFF_hi_trade
reporter.ISO
BDI Burundi LIC Developing 2016 2.124794e+07 2.358957e+08 1.938189e+05 3.834845e+03 2.144176e+07 2.358996e+08 0.021442 0.235900 2.959185e+09 270.0 0.007246 0.079718 7.543783e+08 0.028423 0.312707
MLI Mali LIC Developing 2016 1.133595e+08 1.429417e+09 5.662107e+06 4.662387e+02 1.190216e+08 1.429417e+09 0.119022 1.429417 1.401079e+10 770.0 0.008495 0.102023 6.788468e+09 0.017533 0.210566
MOZ Mozambique LIC Developing 2016 1.797333e+08 1.556931e+09 6.524167e+06 1.277078e+04 1.862574e+08 1.556944e+09 0.186257 1.556944 1.098136e+10 480.0 0.016961 0.141781 8.647422e+09 0.021539 0.180047
STP São Tomé and Príncipe LMC Developing 2016 1.121335e+07 3.065321e+07 0.000000e+00 7.278793e+01 1.121335e+07 3.065328e+07 0.011213 0.030653 3.542375e+08 1730.0 0.031655 0.086533 1.498260e+08 0.074843 0.204593
UGA Uganda LIC Developing 2016 2.273814e+08 1.816069e+09 1.056252e+07 2.457472e+06 2.379439e+08 1.818526e+09 0.237944 1.818526 2.413366e+10 630.0 0.009859 0.075352 7.802353e+09 0.030496 0.233074
flow_data = create_graph_data('2016', threshold_var='trade')
panel = pyreadr.read_r('Data/GER_Orig_Dest_Year_std.Rdata')
IFF_std = panel['GER_Orig_Dest_Year_std']
IFF_std = IFF_std.query('year == "2016"')
IFF_std.describe()
year Imp_IFF_lo Imp_IFF_hi Exp_IFF_lo Exp_IFF_hi rGDP rGNPpc pGDP pGNPpc rImp_IFF_hi_GDP pImp_IFF_hi_GDP
count 7643.0 4.619000e+03 5.058000e+03 4.414000e+03 4.890000e+03 7.604000e+03 7545.000000 7.605000e+03 7546.000000 5.019000e+03 5.056000e+03
mean 2016.0 1.558822e+08 3.134047e+08 2.517679e+07 6.720660e+06 9.307257e+11 19378.532803 9.305462e+11 19367.327061 1.258651e-03 7.896162e-04
std 0.0 1.660777e+09 2.185755e+09 2.624184e+08 8.839573e+07 2.684124e+12 20773.046364 2.683995e+12 20777.406284 5.618831e-03 4.534438e-03
min 2016.0 1.085531e-01 7.092861e+00 1.178560e-01 1.196589e-05 3.039845e+08 270.000000 3.039845e+08 270.000000 1.988676e-11 2.967606e-11
25% 2016.0 1.417457e+05 8.710390e+05 4.058444e+04 1.333348e+03 2.413366e+10 3790.000000 2.413366e+10 3790.000000 9.718551e-06 6.236042e-06
50% 2016.0 1.906819e+06 9.309482e+06 4.923252e+05 3.068387e+04 1.916397e+11 9710.000000 1.916397e+11 9710.000000 1.067327e-04 4.755409e-05
75% 2016.0 2.190065e+07 7.545741e+07 4.539899e+06 4.243309e+05 5.548609e+11 36460.000000 5.548609e+11 36460.000000 5.956323e-04 2.973232e-04
max 2016.0 9.009745e+10 7.971280e+10 9.834296e+09 4.695487e+09 1.870719e+13 82130.000000 1.870719e+13 82130.000000 1.350083e-01 1.430648e-01
flow_data = pd.merge(left=flow_data,
                     right=IFF_std[['reporter.ISO', 'partner.ISO', 'pImp_IFF_hi_GDP']],
                     on=['reporter.ISO', 'partner.ISO'])
sns.distplot(flow_data['pImp_IFF_hi_GDP'], kde=True);
plt.title('Distribution of IFF in partner countries')
plt.xlabel('Mis-invoiced trade as proportion of GDP in partner countries');
_images/Unsupervised-learning-trade_145_0.png
flow_data = flow_data.query('pImp_IFF_hi_GDP >= 0.0001')

Create directed graph

graph = nx.from_pandas_edgelist(flow_data,
                                'reporter.ISO',
                                'partner.ISO', 
                                'Imp_IFF_hi',
                                create_using = nx.DiGraph())
# Check the number of unique reporters and partners
pd.unique(flow_data[['reporter.ISO', 'partner.ISO']].values.ravel('K'))
len(pd.unique(flow_data[['reporter.ISO', 'partner.ISO']].values.ravel('K')))
20
# Check the number of nodes and edges in the graph
len(graph.nodes()), len(graph.edges())
(20, 22)
# Print some of the edges in the network
list(graph.edges(data=True))[0:10]
[('BDI', 'TZA', {'Imp_IFF_hi': 19364537.447332628}),
 ('BDI', 'UGA', {'Imp_IFF_hi': 14643052.004823904}),
 ('UGA', 'BDI', {'Imp_IFF_hi': 479170.6143764456}),
 ('UGA', 'IND', {'Imp_IFF_hi': 458125729.8877268}),
 ('UGA', 'IDN', {'Imp_IFF_hi': 205694749.4252262}),
 ('UGA', 'MYS', {'Imp_IFF_hi': 43635085.092753015}),
 ('UGA', 'MUS', {'Imp_IFF_hi': 1272186.476237092}),
 ('UGA', 'ZAF', {'Imp_IFF_hi': 129613707.17569979}),
 ('UGA', 'TZA', {'Imp_IFF_hi': 35050167.522962645}),
 ('MLI', 'GHA', {'Imp_IFF_hi': 8081515.073730136})]
# Get the degree of the nodes
graph.degree
DiDegreeView({'BDI': 3, 'TZA': 2, 'UGA': 8, 'MLI': 5, 'GHA': 1, 'SEN': 1, 'ZAF': 3, 'TGO': 1, 'MOZ': 7, 'NAM': 1, 'NLD': 1, 'PRT': 2, 'SGP': 1, 'STP': 1, 'IND': 1, 'IDN': 1, 'MYS': 1, 'MUS': 2, 'MRT': 1, 'FIN': 1})
# Create dictionary of GDP per capita for each country
GDP_attr = covariates.loc[:, ['gdp-pc']]
GDP_attr = GDP_attr.to_dict('index')
list(GDP_attr.items())[0:10]
[('AFG', {'gdp-pc': 547.228110150363}),
 ('ALB', {'gdp-pc': 4124.10890718622}),
 ('DZA', {'gdp-pc': 3946.4214445883}),
 ('ASM', {'gdp-pc': 11696.9555623329}),
 ('AND', {'gdp-pc': 37224.1089162924}),
 ('AGO', {'gdp-pc': 3506.07288506966}),
 ('AIA', {'gdp-pc': nan}),
 ('ATG', {'gdp-pc': 15197.6174551735}),
 ('ARG', {'gdp-pc': 12790.2424732447}),
 ('ARM', {'gdp-pc': 3591.82927553023})]
# Set GDP per capita as a node attribute
nx.set_node_attributes(graph, GDP_attr)
list(graph.nodes(data=True))[0:10]
[('BDI', {'gdp-pc': 282.193130404814}),
 ('TZA', {'gdp-pc': 966.474622354379}),
 ('UGA', {'gdp-pc': 608.705735097409}),
 ('MLI', {'gdp-pc': 779.874933008414}),
 ('GHA', {'gdp-pc': 1931.38947036943}),
 ('SEN', {'gdp-pc': 1269.04040702421}),
 ('ZAF', {'gdp-pc': 5272.91842475419}),
 ('TGO', {'gdp-pc': 597.471088804109}),
 ('MOZ', {'gdp-pc': 428.926487995524}),
 ('NAM', {'gdp-pc': 4786.23530295128})]
graph.nodes['MLI']
{'gdp-pc': 779.874933008414}
graph.nodes['MLI']['gdp-pc']
779.874933008414
# Create dictionary of longitude and latitude for countries
pos = {country: (v['Longitude'], v['Latitude'])
       for country, v in
       crosswalk.drop_duplicates('ISO3166.3').set_index('ISO3166.3').to_dict('index').items()
}
list(pos.items())[0:10]
[('ABW', (-69.98267711, 12.52088038)),
 ('AFG', (66.00473366, 33.83523073)),
 ('AGO', (17.53736768, -12.29336054)),
 ('AIA', (-63.06498927, 18.2239595)),
 ('ALA', (19.95328768, 60.21488688)),
 ('ALB', (20.04983396, 41.14244989)),
 ('AND', (1.56054378, 42.54229102)),
 ('ANT', (nan, nan)),
 ('ARE', (54.300167099999996, 23.90528188)),
 ('ARG', (-65.17980692, -35.3813488))]
# Create list of colors for nodes in the graph
col = [nx.get_node_attributes(graph, 'gdp-pc')[n] for n in graph.nodes]
print(np.min(col), np.max(col))
282.193130404814 56724.1703858863
# Create list of colors for edges in the graph
edge_col = [nx.get_edge_attributes(graph, 'Imp_IFF_hi')[e] for e in graph.edges]
edge_col = np.log(edge_col)
# Draw directed graph and color nodes by GDP per capita
plt.figure(figsize = (10,10))
nx.draw_networkx(graph,
#                  width=0.5,
                 node_color=col, cmap=plt.cm.spring_r,
                 edge_color=edge_col, edge_cmap=plt.cm.get_cmap('RdPu'))
plt.show();
_images/Unsupervised-learning-trade_160_0.png
# Extract outdegree (the number of edges coming out of nodes) and degree
outdeg = graph.out_degree
deg = graph.degree
# Size of nodes will be proportional to their degree
sizes = [10 * deg[c] for c in graph.nodes]
# Label the countries if their outdegree is at least 1, i.e. if they are the reporting African countries
labels = {c: c if outdeg[c] >= 1 else ''
          for c in graph.nodes}
# Map projection
crs = ccrs.PlateCarree()
fig, ax = plt.subplots(1, 1, figsize=(20, 8), subplot_kw=dict(projection=crs))
ax.coastlines(color='lightgray')
ax.add_feature(cfeature.LAND, facecolor='lightgray')
ax.add_feature(cfeature.BORDERS, edgecolor='white')
# ax.set_extent([-160, 180, -60, 90], crs=ccrs.PlateCarree())

# Overlay network graph
nx.draw_networkx(graph, ax=ax,
                 pos=pos,
                 font_size=14,
                 alpha=0.7,
#                  width=0.5,
                 node_color=col, cmap=plt.cm.spring_r,
                 edge_color=edge_col, edge_cmap=plt.cm.get_cmap('RdPu'),
                 node_size=sizes,
                 labels=labels)
_images/Unsupervised-learning-trade_162_0.png